---
title: "Y02 Network Data Analysis"
output: html_document
date: "2024-08-22"
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

rm(list = ls())
cat("\014")

library(vroom)
library(tidyverse)
library(httr)
library(jsonlite)
library(vroom)
library(ggplot2)
library(ggspatial)
library(ggthemes)
library(DescTools)
library(knitr)
library(kableExtra)
library(igraph)
library(ggraph)
library(tidygraph)
library(ggrepel)

options(scipen = 999)
```


*Load Data* 

*All US inventor assigned Y02 patents granted by USPTO - including their citation connections to US inventors*


```{r}
sim <- vroom("data/y02_w_citations.csv")

n_distinct(sim$patent_id)
```


*We look at all Y02 patents between 2000-2022 with citation to patents >= 1999 *

```{r}
#sim$patent_year <- substr(sim$patent_date, 1, 4)

sim <- sim %>%
  filter(patent_year >= 2000 & patent_year < 2025 & citation_year >= 1999) %>%
  distinct(patent_id, citation_patent_id, .keep_all = T)
```


*Number of unique Patents*
```{r}
n_distinct(sim$patent_id)
```


```{r}
gov <- sim %>%
  filter(!is.na(fedagency_name)) %>%
  distinct(patent_id, .keep_all = T)


doe <- gov %>%
  filter(assignee_organization %in% c("Department of Energy"))

dod <- gov %>%
  filter(assignee_organization %in% c("Department of Defense"))
```


```{r}
2250 / 11721
```
```{r}
4796 / 11721
```


*Number of citation connections*
```{r}
citations <- sim %>%
  distinct(patent_id, citation_patent_id, .keep_all = T)
```




```{r}
sim$cited_fedagency_name
```




*Assign federally funded patents to funding agency*

```{r}
sim <- sim %>%
  rename(assignee_organization = disambig_assignee_organization) %>%
  rename(cited_gov_org_name = cited_fedagency_name) %>%
  mutate(
    assignee_organization = if_else(!is.na(level_one), level_one, assignee_organization),
    cited_organization = if_else(!is.na(cited_level_one), cited_level_one, cited_organization)
  )

```



*Standardize organization names*
```{r}
library(dplyr)
library(stringr)

# Extended pattern list
org_patterns <- list(
  "NASA" = "nasa|national aeronautics and space administration|the united states of america as represented by the administrator of the national aeronautics and space administration|PPP NASA",
  "Department of Energy" = "department of energy|sandia|battelle|alliance for sustainable energy.*|uchicago argonne|united states energy research and development administration|lawrence livermore national laboratory|argonne national laboratory|lawrence berkeley national laboratory|Battell Energy Alliance, LLC|PPP DOE|Pacific Northwest National Laboratory|
National Energy Technology Laboratory|National Renewable Energy Laboratory|
Los Alamos National Laboratory|National Energy Technology Laboratory",
  "Department of Defense" = "army|navy|air force|PPP DOD|Naval Air Systems Command|Defense Advanced Research Projects Agency|Naval Air Warfare Center",
  "SunPower" = "sunpower",
  "Boeing" = "boeing",
  "NI Standards" = "national institute of standards and technology|the united states of america, as represented by the secretary of commerce national institute of standards and technology|14th and constitution national institute of standards and technology",
"DHHS" = "health and human services"
)

# Apply standardization
for (org in names(org_patterns)) {
  pattern <- org_patterns[[org]]
  sim <- sim %>%
    mutate(
      assignee_organization = if_else(str_detect(assignee_organization, regex(pattern, ignore_case = TRUE)), org, assignee_organization),
      cited_organization = if_else(str_detect(cited_organization, regex(pattern, ignore_case = TRUE)), org, cited_organization)
    )
}

```

*Number of distinct patents now*

```{r}
test <- sim %>%
  filter(!assignee_organization == cited_organization)

n_distinct(test$patent_id)
```


*Turn into network data object* 
```{r}

forward <- sim %>%
  filter(!is.na(cited_organization)) %>%
  dplyr::select(assignee_organization, cited_organization, patent_year) %>%
  rename(source = cited_organization, destination = assignee_organization, year_citation_was_made = patent_year)

sources <- forward %>%
  ungroup() %>%
  distinct(source, .keep_all = T) %>%
  dplyr::select(source) %>%
  rename(label = source)

#Create edge list
#
destinations <- forward %>%
  ungroup() %>%
  distinct(destination, .keep_all = T) %>%
  dplyr::select(destination) %>%
  rename(label = destination)

#sources$label <- as.numeric(sources$label)
  
#join
nodes <- full_join(sources, destinations, by = "label")

#create id column
nodes <- nodes %>% rowid_to_column("id")

#Create edgelist
per_route <- forward %>%
  group_by(source, destination, year_citation_was_made) %>%
  summarise(weight = n()) %>%
  ungroup()
#per_route

#per_route$source <- as.numeric(per_route$source)

edges <- per_route %>%
  left_join(nodes, by = c("source" = "label")) %>%
  rename(from = id)

edges <- edges %>%
  left_join(nodes, by = c("destination" = "label")) %>%
  rename(to = id) %>%
  rename(year = year_citation_was_made)

edges <- dplyr::select(edges, from, to, weight, year)


#nodes <- nodes %>%
#  distinct(label, .keep_all = T)

#We need to add a type attribute based on university, government agency, firm

nodes <- nodes %>%
  mutate(type = case_when(grepl("university|college|TEES|colorado school of mines|massachusetts institute|naval academy|CUNY Energy Institute|Marine Biological Laboratory|california institute|
                                rensselaer polytechnic institute|montana state|
                                rochester institute of technology|naval post graduate school|
                                ndsu|new jersey institute of technology|State of Oregon acting by and through the State Board of Higher Ed. on Behalf of Oregon State Univ.|
                                Rochester Institute of Technology|
                                Board of Regents of the Nevada System of Higher Education on behalf of the Desert Research Institute", label, ignore.case = TRUE) ~ "University",
                          grepl("national lab|U.S. Naval Research|Air Force|Naval Air|national renewable energy lab|Massachusetts Clean Energy|NASA Ames Research Center|
                                National Energy Technology Laboratory|National Institute of Standards and Technology|National Rural Electric Cooperative Association|New York State Smart Grid Consortium|
                                Princeton Plasma Physics Laboratory|SLAC National Accelerator Laboratory|U.S. Army Research Laboratory|United States Navy|national energy technology laboratory|
                                princeton plasma physics laboratory|usda - agriculture research service|
                                weapons division research department (nawcwd)|navfac/nps|dr. vladimir goren|ilya vitebskiy|usda-ars|research & innovation center|usda forest products laboratory|
                                cold regions research and engineering laboratory|
                                engine research center (erc)|nasa george c marshall space flight center|jet propulsion laboratory|
                                National Technology & Engineering Solutions of Sandia, LLC|
                                United States of America as represented by the Administrator of the National Aeronautics and Space Administration|
                                United States of America as represented by the Administrator of NASA|
                                The Government of the United States of America, as represented by the Secretarv of the Navy|
                                The United States of America as represented by the Secretary of the Army|
                                United States Department of Energy|
                                THE UNITED STATES OF AMERICA AS REPRESENTED BY THE ADMINISTRATOR OF THE NATIONAL AERONAUTICS AND SPACE ADMINISTRATION|
                                The United States of America Department of Energy|
                                U.S. Department of Energy|
                                The United States of America as represented by the Secretary of the Department of Health and Human Services, c/o Centers for Disease Control and Prevention|Department of Energy|
                                The United States of America as represented by the United States Energy Research and Development Administration|
                                The United States of America as represented by the United States National Aeronautics and Space Administration|
                                The United States of America as represented by the Administrator of NASA|
                                United States of America as represented by the United States Department of Energy|
                                Government of the United States of America, as represented by the Secretary of Commerce|
                                The United States of America, as represented by the Secretary of Agriculture|
                                The United States of America, as represented by The Secretary of The Interior|
                                The United States of America as represented by United States National Aeronautics and Space Administration|
                                The United States of America as represented by the Admistrator of NASA|NASA|Dept. of Energy|Sandia Corporation|Department of Defense|National Science Foundation|United States Government|DHHS|National Institutes of Health", label, ignore.case = TRUE) ~ "National Lab",
                          TRUE ~ "Firm"))


na <- nodes %>%
  filter(is.na(label))
#none
#removing self citations
edges_excl <- edges %>%
  filter(!from == to) %>%
  filter(from %in% nodes$id) %>%
  filter(to %in% nodes$id)

```

*Check organization names*

```{r}
library(dplyr)
library(tidyr)
library(stringdist)


label_pairs <- expand.grid(label1 = nodes$label, label2 = nodes$label, stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  filter(label1 != label2) %>%
  distinct() %>%
  mutate(similarity = 1 - stringdist(label1, label2, method = "jw"))


similar_labels <- label_pairs %>%
  filter(similarity > 0.97)
```

*Back to standardization*

```{r}

# Step 1: Create a group key before pivoting
similar_labels <- similar_labels %>%
  mutate(group_key = pmin(label1, label2))  # alphabetically first label

name_map <- similar_labels %>%
  pivot_longer(cols = c(label1, label2), values_to = "raw_name", names_to = NULL) %>%
  distinct(group_key, raw_name) %>%
  mutate(raw_name = as.character(raw_name)) %>%  # convert to character
  group_by(group_key) %>%
  mutate(canonical = min(raw_name)) %>%
  ungroup() %>%
  dplyr::select(raw_name, canonical)

```


```{r}
# Standardize assignee_organization
sim <- sim %>%
  left_join(name_map, by = c("assignee_organization" = "raw_name")) %>%
  mutate(assignee_organization = coalesce(canonical, assignee_organization)) %>%
  dplyr::select(-canonical)

# Standardize cited_organization
sim <- sim %>%
  left_join(name_map, by = c("cited_organization" = "raw_name")) %>%
  mutate(cited_organization = coalesce(canonical, cited_organization)) %>%
  dplyr::select(-canonical)

```

```{r}
sim <- sim %>%
  distinct(patent_id, citation_patent_id, .keep_all = T)
```

42,213 nodes 

# 49,039 nodes

Number of citation connections: 
```{r}
sum(edges_excl$weight)
```

Number of edges: 
```{r}
nrow(edges_excl)
```




```{r}
write.csv(nodes, "nodes_y02.csv")

write.csv(edges_excl, "edges_excl_y02.csv")
```




*Turn into igraph object*
```{r}
net <- graph_from_data_frame(d = edges_excl, vertices = nodes, directed = T)
```





*OutDegree values*

```{r}
# Calculate the out-degree of each vertex
g.outd <- degree(net, mode = c("out"))

#Create a dataframe of the out-degree distribution 

# Convert named vector to a long dataframe
long_deg <- data.frame(id = names(g.outd), Value = g.outd) %>%
  gather(key = "id", value = "Value") %>%
  mutate(id = as.numeric(id))

long_deg <- left_join(long_deg, nodes)

ordered_out_deg <- long_deg %>%
  arrange(desc(Value))

subset_out_deg <- head(ordered_out_deg, n = 20)

kable(subset_out_deg)
```

```{r}
# Calculate 99th percentile of out-degrees
quantile_99_outdeg <- quantile(g.outd, probs = 0.99, na.rm = TRUE)

print(quantile_99_outdeg)

```


*citations*

```{r}
outgoing_weights <- strength(net, mode = "out", loops = FALSE)

# Convert named vector to a long dataframe
long_cit <- data.frame(id = names(outgoing_weights), Value = outgoing_weights) %>%
  gather(key = "id", value = "Value") %>%
  mutate(id = as.numeric(id))

long_cit <- left_join(long_cit, nodes)

ordered_out_cit <- long_cit %>%
  arrange(desc(Value))

subset_out_cit <- head(ordered_out_cit, n = 15)

kable(subset_out_cit)
```

```{r}
quantile_99_outgoing <- quantile(outgoing_weights, probs = 0.99, na.rm = TRUE)

print(quantile_99_outgoing)

```



*Hub Score*

```{r}
hs <- hub_score(net, weights=NA)$vector

top_twenty <- hs[order(hs, decreasing = TRUE)[1:20]]

df <- as.data.frame(top_twenty)

df <- cbind(id = rownames(df), df)
rownames(df) <- 1:nrow(df)
df$id <- as.integer(df$id)

df <- left_join(df, nodes)

kable(df)
```

```{r}
quantile_99_hs <- quantile(hs, probs = 0.99, na.rm = TRUE)

print(quantile_99_hs)

```



```{r}
node_attributes <- V(net)$label

node_attributes


```


2-DOE 
44 DOD

```{r}
library(ggrepel)
library(MASS)
library(RColorBrewer) 

# Here, I am specifying a specific color palette -- instead of using ggplot's 
# defaults. 

colors <- brewer.pal(n = 5, "Set1") 
# Here, I am choosing specific colors to use below

red <- colors[1] 
blue <- colors[2] 
green <- colors[3]
orange <- colors[4]
purple <- colors[5]
```


*Calculate Annual Statistics*

```{r}
calculate_outdegrees_for_nodes <- function(net, node_labels, year_range) {
  outdegree_data <- data.frame(Year = year_range)
  
  for (label in node_labels) {
    outdegree_data[[label]] <- numeric(length(year_range))
    
    for (i in 1:length(year_range)) {
      year <- year_range[i]
      yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
      weighted_outdegree <- degree(yearly_network, mode = "out")
      specific_outdegree <- weighted_outdegree[label]
      outdegree_data[[label]][i] <- specific_outdegree
    }
  }
  
  return(outdegree_data)
}

node_labels <- c("2", "44")  
year_range <- 2000:2024  

outdegree_dataframe <- calculate_outdegrees_for_nodes(net, node_labels, year_range)
```


```{r}
# Define a function to calculate the 75th percentile of outdegree values for each year
calculate_percentile_outdegrees <- function(net, year_range) {
  percentile_data <- data.frame(Year = year_range, Percentile = numeric(length(year_range)))
  
  for (i in 1:length(year_range)) {
    year <- year_range[i]
    yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
    outdegrees <- degree(yearly_network, mode = "out")
    percentile_75 <- quantile(outdegrees, 0.999, na.rm = TRUE)
    percentile_data$Percentile[i] <- percentile_75
  }
  
  return(percentile_data)
}

percentile_outdegrees_dataframe <- calculate_percentile_outdegrees(net, year_range)

#############


names <- c("year", "DOE", "DOD")

colnames(outdegree_dataframe) <- names

# left joining the 99th percentile
outdegree_dataframe <- cbind(outdegree_dataframe, percentile_outdegrees_dataframe)

outdegree_dataframe <- outdegree_dataframe %>%
  dplyr::select(-Year)

outdegree_long <- outdegree_dataframe %>%
  pivot_longer(cols = 2:4, names_to = "org", values_to = "citations")

deg <- outdegree_long %>%
  filter(org %in% c("DOE", "DOD", "Percentile")) %>%
  filter(year < 2024) %>%
  ggplot(aes(x = year, y = citations, group = org, color = org)) +
  geom_line(aes(linetype = org)) +
  theme_clean() +
  xlab("") + ylab("Out Degree") + 
  theme(legend.position = "none") +
  scale_color_manual(
    values = c("DOE" = green, 
               "DOD" = red, 
               "Percentile" = blue)) +
  scale_linetype_manual(
     values = c("DOE" = "longdash", 
                "DOD" = "solid",
                "Percentile" = "dotted")) +
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"),
        legend.position = "none") + 
  scale_y_continuous(breaks = c(20, 200, 400, 600))  +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2020, 2023)) 


deg
```



```{r}
deg <- outdegree_long %>%
  filter(org %in% c("DOE", "DOD", "Percentile")) %>%
  filter(year <= 2023) %>%
  ggplot(aes(x = year, y = citations, group = org, color = org, linetype = org)) +
  geom_line(size = 1.2) +  # Thicker lines
  theme_clean() +
  xlab("") + 
  ylab("Out Degree") + 
  scale_color_manual(
    values = c("DOE" = green, 
               "DOD" = red, 
               "Percentile" = blue)) +
  scale_linetype_manual(
    values = c("DOE" = "longdash", 
               "DOD" = "solid",
               "Percentile" = "dotted")) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 14, face = "bold"),
        legend.position = "none") +
  scale_y_continuous(breaks = c(20, 200, 400, 600)) +
  annotate("text", x = 2005, y = 375, label = "DOE", color = green, size = 5, hjust = 0) +
  annotate("text", x = 2017, y = 300, label = "DOD", color = red, size = 5, hjust = 0) +
  annotate("text", x = 2012, y = 90, label = "99th Percentile", color = blue, size = 4, hjust = 0) +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2023))


deg
```



*Calculate Annual Hub Scores* 

```{r}
# Define the function to calculate hub scores
calculate_hub_scores_for_nodes <- function(net, node_labels, year_range) {
  hub_score_data <- data.frame(Year = year_range)
  
  for (label in node_labels) {
    hub_score_data[[label]] <- numeric(length(year_range))
    
    for (i in 1:length(year_range)) {
      year <- year_range[i]
      yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
      hub_scores <- hub_score(yearly_network, weights = NA)$vector
      specific_hub_score <- hub_scores[label]
      hub_score_data[[label]][i] <- specific_hub_score
    }
  }
  
  return(hub_score_data)
}

node_labels <- c("2", "44")  
 

year_range <- 2000:2024  

hub_score_dataframe <- calculate_hub_scores_for_nodes(net, node_labels, year_range)

calculate_percentile_hub_scores <- function(net, year_range) {
  percentile_data <- data.frame(Year = year_range, Percentile = numeric(length(year_range)))
  
  for (i in 1:length(year_range)) {
    year <- year_range[i]
    yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
    hub_scores <- hub_score(yearly_network, weights = NA)$vector
    percentile_75 <- quantile(hub_scores, 0.99, na.rm = TRUE)
    percentile_data$Percentile[i] <- percentile_75
  }
  
  return(percentile_data)
}


percentile_hub_scores_dataframe <- calculate_percentile_hub_scores(net, year_range)

names <- c("year", "DOE", "DOD")

colnames(hub_score_dataframe) <- names

hub_score_dataframe <- cbind(hub_score_dataframe, percentile_hub_scores_dataframe)

hub_score_dataframe <- hub_score_dataframe %>%
  dplyr::select(-Year)


hub_long <- hub_score_dataframe %>%
  pivot_longer(cols = 2:4, names_to = "org", values_to = "citations")

hub <- hub_long %>%
  filter(org %in% c("DOE", "DOD", "Percentile")) %>%
  filter(year <= 2023) %>%
  ggplot(aes(x = year, y = citations, group = org, color = org, linetype = org)) +
  geom_line(size = 1.2) +  # Corrected size placement
  theme_clean() +
  xlab("") + 
  ylab("Hub Score") + 
  scale_color_manual(
    values = c("DOE" = green, 
               "DOD" = red, 
               "Percentile" = blue)) +
  scale_linetype_manual(
    values = c("DOE" = "longdash", 
               "DOD" = "solid",
               "Percentile" = "dotted")) +
  scale_y_continuous() +  # You can customize breaks if needed
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  ) +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2023))


hub
```




*Annual outgoing citations* 

```{r}

# Define the function to calculate outgoing citations
calculate_outgoing_citations_for_nodes <- function(net, node_labels, year_range) {
  citations_data <- data.frame(Year = year_range)
  
  for (label in node_labels) {
    citations_data[[label]] <- numeric(length(year_range))
    
    for (i in 1:length(year_range)) {
      year <- year_range[i]
      yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
      outgoing_citations <- sum(E(yearly_network)[.from(label)]$weight)
      citations_data[[label]][i] <- outgoing_citations
    }
  }
  
  return(citations_data)
}


node_labels <- c("2", "44") 
year_range <- 2000:2023 

citations_dataframe <- calculate_outgoing_citations_for_nodes(net, node_labels, year_range)

calculate_percentile_citations <- function(net, year_range) {
  percentile_data <- data.frame(Year = year_range, Percentile = numeric(length(year_range)))
  
  for (i in 1:length(year_range)) {
    year <- year_range[i]
    yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
    citation_values <- E(yearly_network)$weight
    percentile_75 <- quantile(citation_values, 0.99, na.rm = TRUE)
    percentile_data$Percentile[i] <- percentile_75
  }
  
  return(percentile_data)
}

percentile_citations_dataframe <- calculate_percentile_citations(net, year_range)

names <- c("year", "DOE", "DOD")

colnames(citations_dataframe) <- names

# left joining the 99th percentile
citations_dataframe<- cbind(citations_dataframe, percentile_citations_dataframe)

citations_dataframe <- citations_dataframe %>%
    dplyr::select(-Year)


citations_long <- citations_dataframe %>%
  pivot_longer(cols = 2:4, names_to = "org", values_to = "citations")

 
cit <- citations_long %>%
  filter(org %in% c("DOE", "DOD", "Percentile")) %>%
  filter(year <= 2023) %>%
  ggplot(aes(x = year, y = citations, group = org, color = org, linetype = org)) +
  geom_line(size = 1.2) +  # Thicker lines
  theme_clean() +
  xlab("") + 
  ylab("Citations") + 
  scale_color_manual(
    values = c("DOE" = green, 
               "DOD" = red, 
               "Percentile" = blue)) +
  scale_linetype_manual(
    values = c("DOE" = "longdash", 
               "DOD" = "solid",
               "Percentile" = "dotted")) +
  scale_y_continuous(breaks = c(10, 1000, 2000, 3000)) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  ) +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2023))


cit
```



*Eigenvectors*

```{r}
# Define the function to calculate eigenvector centrality
calculate_eigenvector_centrality_for_nodes <- function(net, node_labels, year_range) {
  centrality_data <- data.frame(Year = year_range)
  
  for (label in node_labels) {
    centrality_data[[label]] <- numeric(length(year_range))
    
    for (i in 1:length(year_range)) {
      year <- year_range[i]
      yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
      eigenvector_centrality <- eigen_centrality(yearly_network)$vector
      specific_eigenvector_centrality <- eigenvector_centrality[label]
      centrality_data[[label]][i] <- specific_eigenvector_centrality
    }
  }
  
  return(centrality_data)
}

node_labels <- c("2", "44")  
year_range <- 2000:2024

eigenvector_centrality_dataframe <- calculate_eigenvector_centrality_for_nodes(net, node_labels, year_range)

## 

# Define a function to calculate the 75th percentile of eigen centrality values for each year
calculate_percentile_eigen_centrality <- function(net, year_range) {
  percentile_data <- data.frame(Year = year_range, Percentile = numeric(length(year_range)))
  
  for (i in 1:length(year_range)) {
    year <- year_range[i]
    yearly_network <- subgraph.edges(net, E(net)[year == year_range[i]])
    eigen_centrality <- eigen_centrality(yearly_network)$vector
    percentile_75 <- quantile(eigen_centrality, 0.99, na.rm = TRUE)
    percentile_data$Percentile[i] <- percentile_75
  }
  
  return(percentile_data)
}

percentile_eigen_centrality_dataframe <- calculate_percentile_eigen_centrality(net, year_range)

####

names <- c("year", "DOE", "DOD")

colnames(eigenvector_centrality_dataframe) <- names

# left joining the 99th percentile
eigenvector_centrality_dataframe <- cbind(eigenvector_centrality_dataframe, percentile_eigen_centrality_dataframe)

eigenvector_centrality_dataframe <- eigenvector_centrality_dataframe %>%
    dplyr::select(-Year)



colnames(eigenvector_centrality_dataframe)[4]  <- "99th Perc."


eigen_long <- eigenvector_centrality_dataframe %>%
  pivot_longer(cols = 2:4, names_to = "org", values_to = "EigenVec")

```



```{r}
eig <- eigen_long %>%
  filter(org %in% c("DOE", "DOD", "99th Perc.")) %>%
  filter(year <= 2023) %>%
  ggplot(aes(x = year, y = EigenVec, group = org, color = org, linetype = org)) +
  geom_line(size = 1.2) +  # Thicker lines
  theme_clean() +
  xlab("") + 
  ylab("Eigenvector") +
  scale_color_manual(
    values = c("DOE" = green, 
               "DOD" = red, 
               "99th Perc." = blue)
  ) +
  scale_linetype_manual(
    values = c("DOE" = "longdash", 
               "DOD" = "solid",
               "99th Perc." = "dotted")
  ) +
  theme(
    axis.text = element_text(size = 13),
    axis.title = element_text(size = 13, face = "bold"),
    legend.position = "none"  # Remove legend
  ) +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2023))

 

eig
```



```{r}

library(cowplot)
library(ggpubr)
library(gridExtra)
library(grid)

# Combine the four plots (no legend)
plots_combined <- cowplot::plot_grid(
  cit, deg, hub, eig,
  ncol = 2, align = "hv"
)


plots_combined


width <- 8 

ggsave("fourwayplot_Y02.png",plots_combined, width = width, height = width/1.618, units = "in")
```







```{r}



```




**K-Cores** 

To more formally evaluate the importance of federal agencies within this network, 
the K-core algorithm. 
To find the most important nodes within a network, this approach successively eliminates less
connected nodes from the network (which also reduces the number of connections of the remaining nodes), 
to reveal the core of the network without which there would not be a stable network at all. 
Hoffmann, 2021 
The core represents the most tightly interlinked subnetwork of nodes without which the entire network would seize to exist. 

k = 1 means what?
We remove all nodes that have just one citation connection!


```{r}
# Calculate the coreness for each node
core_values <- coreness(net)

# Find the maximum k-core value
max_core <- max(core_values)
```

```{r}
# Define k value
k <- 365

# Extract the k-core subgraph
g_kcore <- induced_subgraph(net, vids = which(core_values >= k))


```

Core - Number of nodes 


```{r}
library(igraph)
library(tidygraph)
library(ggraph)
library(ggrepel)


k <- 365

g_kcore <- induced_subgraph(net, vids = which(core_values == k))

V(g_kcore)$lab <- ifelse(
  V(g_kcore)$label %in% c(
    "Department of Defense", 
    "Department of Energy", 
    "NASA", 
    "United States Government", 
    "National Science Foundation"
  ), 
  ifelse(
    V(g_kcore)$label == "United States Government", 
    "Federal Government", 
    V(g_kcore)$label
  ), 
  NA
)

g_kcore_tbl <- as_tbl_graph(g_kcore)

k_core <- ggraph(g_kcore_tbl, layout = "drl") +
  geom_edge_link(color = "lightgray", alpha = 0.25) +
  geom_node_point(aes(color = V(g_kcore)$type), size = 4, show.legend = FALSE) +
  geom_text_repel(
    aes(x = x, y = y, label = lab), 
    color = "black", 
    size = 3, 
    box.padding = 0.35, 
    point.padding = 0.5, 
    segment.color = "gray50"
  ) +
  scale_color_manual(values = c(
    "Firm" = "purple", 
    "University" = "yellow", 
    "National Lab" = "green"
  )) +
  theme_void()




width <- 8 

ggsave("k-cores_Y02.png",k_core, width = width, height = width/1.618, units = "in")
```

*List of Core*

```{r}
V(g_kcore)$label
```


```{r}

institutions <- c(
  "Department of Energy",
  "National Science Foundation",
  "Boeing",
  "United States Government",
  "Ford Global Technologies, LLC",
  "Google LLC",
  "GM Global Technology Operations LLC",
  "Caterpillar Inc.",
  "Texas Instruments Incorporated",
  "Department of Defense",
  "United Technologies Corporation",
  "General Electric Company",
  "Honeywell International Inc.",
  "Hamilton Sundstrand Corporation",
  "Applied Materials, Inc.",
  "General Motors LLC",
  "Micron Technology, Inc.",
  "Intel Corporation",
  "International Business Machines Corporation",
  "Delphi Technologies",
  "NASA",
  "Motorola, Inc.",
  "Hewlett-Packard Development Company, L.P.",
  "Sun Microsystems Inc.",
  "Dell Products L.P.",
  "Advanced Micro Devices, Inc.",
  "Apple Inc.",
  "Microsoft Corporation",
  "Broadcom Corporation",
  "Rambus Inc.",
  "Lucent Technologies Inc.",
  "Lockheed Martin Corporation",
  "Cisco Technology, Inc.",
  "Massachusetts Institute of Technology",
  "3M Innovative Properties Company",
  "Rockwell Automation Technologies, Inc.",
  "Cypress Semiconductor Corporation",
  "Amazon Technologies, Inc.",
  "AT&T Intellectual Property I, L.P.",
  "Qualcomm Incorporated",
  "Johnson Controls Technology Company",
  "Fisher-Rosemount Systems, Inc",
  "NVIDIA Corporation",
  "Hewlett-Packard Company, L.P.",
  "Microsoft Technology Licensing, LLC"
)



```

```{r}

# Create a comma-separated string with line breaks after every few items
institutions_str <- paste(institutions, collapse = ", ")
formatted_institutions <- strwrap(institutions_str, width = 80)
formatted_text <- paste(formatted_institutions, collapse = "\n")

# Load the necessary package for exporting text as an image
library(grid)
library(gridExtra)

# Convert the formatted text into a grid graphic
text_plot <- textGrob(formatted_text, x = 0.5, y = 0.5, just = "center", gp = gpar(fontsize = 10))

# Save the plot as a PNG file in A4 dimensions
png("institutions_list_A4.png", width = 2480, height = 3508, res = 300)  # A4 size at 300 dpi
grid.draw(text_plot)
dev.off()


```







```{r}
library(intergraph)
library(ergm.count)
```



*Create Government node attribute*
```{r}
nodes <- nodes %>%
  mutate(attribute = if_else(type == "National Lab", 1,0))

net <- graph_from_data_frame(d = edges_excl, vertices = nodes, directed = T)

```


*Filter network to nodes with at least one outgoing citation* 
```{r}

outdeg <- degree(net, mode = "out")

# Filter the network to only include these nodes
net <- induced_subgraph(net, vids = V(net)[outdeg > 0])

```



*Create a label for citing a government node*

```{r}
# Identify nodes with the attribute of interest
gov_nodes <- V(net)[attribute == 1]


# Create an attribute for outdegree value 
V(net)$outdegree <- degree(net, mode = "out")

V(net)$indegree <- degree(net, mode = "in")

```


```{r}
# Set cites_gov to 1 for nodes with incoming ties from governmental nodes

V(net)$cites_gov <- sapply(V(net), function(v) {
  any(gov_nodes %in% neighbors(net, v, mode = "in"))
})
# Convert logical TRUE/FALSE to binary 1/0
V(net)$cites_gov <- as.numeric(V(net)$cites_gov)

```

```{r}
net_network <- asNetwork(net)
```


Do nodes that cite government nodes tend to receive more citations themselves?

*Basic Model*
```{r}
model <- ergm(net_network ~ edges + nodeicov("cites_gov"))
summary(model)
```

edges              -10.495538   0.005460      0 -1922.4  <0.0001 ***
nodeicov.cites_gov   3.620144   0.006026      0   600.7  <0.0001 ***










*Controlling for Degree Value*
```{r}
model_2 <- ergm(net_network ~ edges + nodeicov("cites_gov") + nodeocov("outdegree"))
summary(model_2)
```
edges              -1.068e+01  5.800e-03      0 -1841.5   <1e-04 ***
nodeicov.cites_gov  3.768e+00  6.262e-03      0   601.8   <1e-04 ***
nodeocov.outdegree  1.390e-03  1.987e-06      0   699.2   <1e-04 ***


```{r}
model_3 <- ergm(net_network ~ edges + nodeicov("cites_gov") + nodeocov("outdegree") +
                nodeicov("indegree"))
summary(model_3)
```

edges              -10.742082321   0.005978250      0 -1796.9  <0.0001 ***
nodeicov.cites_gov   3.574409821   0.006499515      0   550.0  <0.0001 ***
nodeocov.outdegree   0.001402634   0.000001982      0   707.6  <0.0001 ***
nodeicov.indegree    0.000716334   0.000001127      0   635.6  <0.0001 ***








**Network Resiliency Statistics**

```{r}
# Calculate outdegree for each node
outdegrees <- degree(net, mode = "out")

# Take the mean
average_outdegree <- mean(outdegrees)

average_outdegree

```
```{r}
indegrees <- degree(net, mode = "in")

# Take the mean
average_indegree <- mean(indegrees)

average_indegree
```
```{r}
average_path_length <- mean_distance(net, directed = TRUE)

average_path_length
```


```{r}
# Calculate average local clustering coefficient
average_clustering <- transitivity(net, type = "average")

average_clustering
```


```{r}
net_undirected <- as.undirected(net, mode = "collapse")

communities <- cluster_fast_greedy(net_undirected)

# Modularity
modularity(communities)

```


```{r}
library(igraph)

sub_net <- induced_subgraph(net, vids = V(net)[type != "National Lab"])


```

```{r}
indegrees <- degree(sub_net, mode = "in")

average_indegree <- mean(indegrees)

average_indegree
```

```{r}
average_path_length <- mean_distance(sub_net, directed = TRUE)

average_path_length
```

```{r}
average_clustering <- transitivity(sub_net, type = "average")

average_clustering
```

```{r}
net_undirected <- as.undirected(sub_net, mode = "collapse")

communities <- cluster_fast_greedy(net_undirected)

# Modularity
modularity(communities)

```
